I started with preparing my DFM.

# Load the clothing-reviews dataset and build a document-feature matrix (DFM).
tot <- read.csv("clothing_reviews23.csv")

# Replace apostrophes with spaces.
# NOTE(review): this splits contractions ("it's" -> "it s"), which is why the
# stray single-letter tokens "t" and "s" appear in topfeatures() below.
tot$text <- gsub("'"," ",tot$text) 

# quanteda corpus built from the data frame (document text in the "text" column).
myCorpus <- corpus(tot)

# Tokenize, dropping punctuation, numbers, symbols and separators;
# hyphenated compounds are split into their parts.
tok2 <- tokens(myCorpus , remove_punct = TRUE, remove_numbers=TRUE, remove_symbols = TRUE, split_hyphens = TRUE, remove_separators = TRUE)
# Remove English stopwords.
tok2 <- tokens_remove(tok2, stopwords("en"))

Dfm <- dfm(tok2 )
topfeatures(Dfm )
## dress  love     t  size     s   top   fit great  like  wear 
##  2600  2299  1980  1900  1890  1805  1750  1709  1553  1434
# Keep only features of at least 2 characters -- this removes the
# orphaned "t"/"s" tokens created by the apostrophe substitution above.
Dfm <- dfm_remove(Dfm , min_nchar=2)
topfeatures(Dfm )
##  dress   love   size    top    fit  great   like   wear   just fabric 
##   2600   2299   1900   1805   1750   1709   1553   1434   1254   1175
# Drop rare features (fewer than 5 occurrences in the whole corpus).
Dfm <- dfm_trim(Dfm,  min_termfreq = 5, verbose=TRUE)
## Removing features occurring:
##   - fewer than 5 times: 4,986
##   Total features removed: 4,986 (68.2%).

Then I applied the GloVe algorithm via quanteda and text2vec. I also built a distance-weighted FCM.

# Vocabulary of the trimmed DFM -- the word set the embeddings are trained on.
Dfm_vocab <- featnames(Dfm )
str(Dfm_vocab)
##  chr [1:2328] "absolutely" "wonderful" "silky" "sexy" "comfortable" "love" ...
# Re-tokenize the corpus without cleaning: co-occurrence windows should be
# computed on the original token sequence.
mov_tokens <- tokens(myCorpus) 

# padding = TRUE leaves empty placeholders for discarded tokens, so window
# distances between surviving words are not artificially shortened.
mov_tokens2 <- tokens_select(mov_tokens, Dfm_vocab, padding = TRUE)

# Feature co-occurrence matrix: window of 5 tokens, counts weighted by
# 1/distance (the weighting scheme of the original GloVe paper).
fcmat_news <- fcm(mov_tokens2, context = "window", count = "weighted", weights = 1/(1:5))

After that I estimated word embeddings via GloVe. I increased the default number of dimensions to 150 and the maximum number of co-occurrences (x_max) to 15. In addition, I set the number of iterations to 75. I also took the sum of the main and context vectors to get higher-quality embeddings.

# GloVe model (text2vec): 150-dimensional vectors, co-occurrence cap x_max = 15.
glove <- GlobalVectors$new(rank=150, x_max=15)
set.seed(123)
# 75 iterations; n_threads = 1 keeps the fit reproducible with set.seed().
system.time(glove_main <- glove$fit_transform(fcmat_news, n_iter = 75, convergence_tol = 0.01, n_threads = 1))
# Context (output) vectors learned alongside the main vectors.
wv_context <- glove$components
dim(wv_context)
# Summing main + context vectors typically yields higher-quality embeddings.
glove_main <- glove_main + t(wv_context)

Then I created a dataframe out of the GloVe results. After that I defined a plot function for the second and third dimensions and created the plot for several words that I thought could be found in clothing reviews. From it, we can see a cluster with the words “texture”, “comfortable” and “cozy”, which makes sense.

# Wrap the summed GloVe matrix in a data frame: one row per word,
# columns V1..V150 for the 150 embedding dimensions.
glove_dataframe <- as.data.frame(glove_main)
nrow(glove_dataframe)
## [1] 2878
colnames(glove_dataframe )
##   [1] "V1"   "V2"   "V3"   "V4"   "V5"   "V6"   "V7"   "V8"   "V9"   "V10" 
##  [11] "V11"  "V12"  "V13"  "V14"  "V15"  "V16"  "V17"  "V18"  "V19"  "V20" 
##  [21] "V21"  "V22"  "V23"  "V24"  "V25"  "V26"  "V27"  "V28"  "V29"  "V30" 
##  [31] "V31"  "V32"  "V33"  "V34"  "V35"  "V36"  "V37"  "V38"  "V39"  "V40" 
##  [41] "V41"  "V42"  "V43"  "V44"  "V45"  "V46"  "V47"  "V48"  "V49"  "V50" 
##  [51] "V51"  "V52"  "V53"  "V54"  "V55"  "V56"  "V57"  "V58"  "V59"  "V60" 
##  [61] "V61"  "V62"  "V63"  "V64"  "V65"  "V66"  "V67"  "V68"  "V69"  "V70" 
##  [71] "V71"  "V72"  "V73"  "V74"  "V75"  "V76"  "V77"  "V78"  "V79"  "V80" 
##  [81] "V81"  "V82"  "V83"  "V84"  "V85"  "V86"  "V87"  "V88"  "V89"  "V90" 
##  [91] "V91"  "V92"  "V93"  "V94"  "V95"  "V96"  "V97"  "V98"  "V99"  "V100"
## [101] "V101" "V102" "V103" "V104" "V105" "V106" "V107" "V108" "V109" "V110"
## [111] "V111" "V112" "V113" "V114" "V115" "V116" "V117" "V118" "V119" "V120"
## [121] "V121" "V122" "V123" "V124" "V125" "V126" "V127" "V128" "V129" "V130"
## [131] "V131" "V132" "V133" "V134" "V135" "V136" "V137" "V138" "V139" "V140"
## [141] "V141" "V142" "V143" "V144" "V145" "V146" "V147" "V148" "V149" "V150"
# The word identities are the row names; expose them as a regular column
# so that rows can be looked up by word.
glove_dataframe$word <- row.names(glove_dataframe )


# Plot selected words on two embedding dimensions.
#
# words:           character vector of words to display.
# glove_dataframe: data frame with a "word" column plus numeric embedding
#                  columns (one row per word).
# dims:            the two columns holding the dimensions to plot; defaults
#                  to columns 2 and 3 (second and third embedding dimension,
#                  matching the original behaviour and the axis labels --
#                  note the labels are not updated for non-default dims).
#
# Fix: a word missing from glove_dataframe used to yield empty/NA
# coordinates and an obscure failure inside text(); such words are now
# skipped with an explicit warning.
plot_words <- function(words, glove_dataframe, dims = c(2, 3)){
  plot(0, 0, xlim=c(-0.5, 0.5), ylim=c(-0.5,0.5), type="n",
       xlab="Second dimension", ylab="Third dimension")
  for (word in words){
    vector <- as.numeric(glove_dataframe[glove_dataframe$word==word, dims])
    if (length(vector) < 2 || anyNA(vector)) {
      warning("word not found in embedding: ", word, call. = FALSE)
      next
    }
    text(vector[1], vector[2], labels=word)
  }
}

plot_words(c("comfortable", "fit", "quality", "fabric", "size", "trendy", "texture", "cozy"), glove_dataframe)

Afterward I plotted all the dimensions together. The plot is a mess, but some clusters can still be seen — for instance, colors around ~4.5 on V1 and 0 on V2.

# t-SNE projection of the full 150-dimensional embedding down to 2D.
set.seed(123)
system.time(tsne <-  Rtsne(glove_main, perplexity = 50))
# (timing output below is in a Russian locale: user / system / elapsed)
## пользователь      система       прошло 
##        11.73         0.00        12.40
str(tsne)
## List of 14
##  $ N                  : int 2878
##  $ Y                  : num [1:2878, 1:2] -0.93948 0.01981 0.4322 0.00947 2.84562 ...
##  $ costs              : num [1:2878] 0.000704 0.000716 0.001199 0.000863 0.001413 ...
##  $ itercosts          : num [1:20] 74 74 74 74 74 ...
##  $ origD              : int 50
##  $ perplexity         : num 50
##  $ theta              : num 0.5
##  $ max_iter           : num 1000
##  $ stop_lying_iter    : int 250
##  $ mom_switch_iter    : int 250
##  $ momentum           : num 0.5
##  $ final_momentum     : num 0.8
##  $ eta                : num 200
##  $ exaggeration_factor: num 12
##  - attr(*, "class")= chr [1:2] "Rtsne" "list"
# $Y holds the 2D coordinates; its rows are in the same order as the rows
# of glove_main, so the row names of glove_main label the points.
# NOTE(review): origD = 50 suggests Rtsne applied its default initial PCA
# reduction before t-SNE -- confirm against the Rtsne defaults if it matters.
tsne_plot <- tsne$Y
tsne_plot  <- as.data.frame(tsne_plot)
str(tsne_plot)
## 'data.frame':    2878 obs. of  2 variables:
##  $ V1: num  -0.93948 0.01981 0.4322 0.00947 2.84562 ...
##  $ V2: num  -3.85 -1.32 2.08 -1.22 -4.16 ...
# Attach word labels (row order matches glove_main, see note above).
tsne_plot$word  <- row.names(glove_main)
str(tsne_plot)
## 'data.frame':    2878 obs. of  3 variables:
##  $ V1  : num  -0.93948 0.01981 0.4322 0.00947 2.84562 ...
##  $ V2  : num  -3.85 -1.32 2.08 -1.22 -4.16 ...
##  $ word: chr  "Absolutely" "wonderful" "silky" "sexy" ...
# Scatter plot of all words as text labels.
tsne_plot2 <- ggplot(tsne_plot, aes(x = V1, y = V2, label = word)) + geom_text(size = 3)
tsne_plot2

To see the positions of these words I used the following. It can also be used to check the “closeness” of any pair of words.

# Look up the t-SNE coordinates of individual words, e.g. to see how close
# "dark" and "red" ended up in the projection.
tsne_plot[which(tsne_plot$word=="dark"),]
##          V1        V2 word
## 430 4.46632 0.6483829 dark
tsne_plot[which(tsne_plot$word=="red"),]
##           V1         V2 word
## 907 4.393802 0.06918076  red

I also transformed the ggplot into an interactive plotly plot.

# Interactive (plotly) version of the t-SNE scatter plot.
ggplotly(tsne_plot2)

Then I computed umap on the entire dataset.

# UMAP on the full embedding matrix. The computation is expensive, so it was
# run once (code kept commented out for reproducibility) and the result was
# cached to disk with saveRDS, then reloaded here.
# set.seed(123)
# system.time(glove_umap <- umap(glove_main, n_components = 2, metric = "cosine", n_neighbors = 20, min_dist = 0.1))
# saveRDS(glove_umap, file = "glove_umap.rds")

glove_umap <- readRDS("glove_umap.rds")

# $layout holds the 2D UMAP coordinates, one row per word (row names = words).
df_glove_umap <- as.data.frame(glove_umap$layout)

df_glove_umap$word <- row.names(df_glove_umap)

2D plot without labeling:

# Unlabeled 2D scatter of all word embeddings in UMAP space.
ggplot(df_glove_umap) +
  geom_point(aes(x = V1, y = V2), colour = 'blue', size = 0.05) + 
  labs(title = "Word embedding in 2D using UMAP")

Then I looked at some similarities. For the word “cozy” the top similar words make total sense. However, from the plot of the closest words, which includes not only the top 10, we can see that not all nearby words are really semantically connected to “cozy”.

# Nearest neighbours of "cozy" by cosine similarity (text2vec::sim2).
cozy<- glove_main["cozy", , drop = F]
cos_sim_cozy <- sim2(x = glove_main, y = cozy, method = "cosine", norm = "l2")
head(sort(cos_sim_cozy[,1], decreasing = T), 10)
##      cozy      warm     super      soft     Soooo     cycle        ha      cute 
## 1.0000000 0.3680378 0.3428902 0.3127455 0.3038562 0.2858981 0.2819521 0.2700653 
##   flowing  supposed 
## 0.2655902 0.2634320
# Keep the 25 closest words and locate them in the UMAP projection.
# NOTE(review): naming this data frame `select` shadows dplyr::select for
# non-function lookups; R still resolves the function in call position,
# but a different name would be clearer.
select <- data.frame(rownames(as.data.frame(head(sort(cos_sim_cozy[,1], decreasing = TRUE), 25))))
colnames(select) <- "word"
selected_words <- inner_join(x= df_glove_umap , y=select, by= "word")  

# Plot the 25 neighbours, highlighting "cozy" itself in red.
ggplot(selected_words, aes(x = V1, y = V2, colour = word == 'cozy')) + 
  geom_point(show.legend = FALSE) + 
  scale_color_manual(values = c('black', 'red')) +
  geom_text(aes(V1, V2, label = word), show.legend = FALSE, size = 3.5, vjust=-1.5, hjust=0) +
  labs(title = "GloVe word embedding of words related to 'cozy'") 

Top 10 words for “quality” are also quite meaningful, but here almost all the other words that are located close to the words (can be seen on the plot) are logically connected to it.

# Nearest neighbours of "quality" by cosine similarity, same recipe as "cozy".
quality <- glove_main["quality", , drop = F]
cos_sim_quality <- sim2(x = glove_main, y = quality, method = "cosine", norm = "l2")
head(sort(cos_sim_quality[,1], decreasing = T), 10)
##   quality      good    fabric    design     great   overall  material      soft 
## 1.0000000 0.4723188 0.4612909 0.4170621 0.4001164 0.3932099 0.3885234 0.3421723 
## beautiful     looks 
## 0.3398234 0.3324088
# Top-25 neighbours joined onto the UMAP coordinates.
select <- data.frame(rownames(as.data.frame(head(sort(cos_sim_quality[,1], decreasing = TRUE), 25))))
colnames(select) <- "word"

selected_words <- inner_join(x= df_glove_umap , y=select, by= "word") 

# Plot the neighbours, highlighting "quality" itself in red.
ggplot(selected_words, aes(x = V1, y = V2, colour = word == 'quality')) + 
  geom_point(show.legend = FALSE) + 
  scale_color_manual(values = c('black', 'red')) +
  geom_text(aes(V1, V2, label = word), show.legend = FALSE, size = 3.5, vjust=-1.5, hjust=0) +
  labs(title = "GloVe word embedding of words related to 'quality'") 

Situation with “size” is quite similar to “quality”.

# Nearest neighbours of "size" by cosine similarity, same recipe as above.
size <- glove_main["size", , drop = F]
cos_sim_size <- sim2(x = glove_main, y = size , method = "cosine", norm = "l2")
head(sort(cos_sim_size [,1], decreasing = T), 10)
##      size     small    medium    petite   ordered        xs     large      true 
## 1.0000000 0.6439723 0.5655103 0.5422445 0.5387652 0.5283161 0.5133025 0.4956188 
##     usual       fit 
## 0.4917295 0.4900316
# Top-25 neighbours joined onto the UMAP coordinates.
select <- data.frame(rownames(as.data.frame(head(sort(cos_sim_size[,1], decreasing = TRUE), 25))))
colnames(select) <- "word"
selected_words <- inner_join(x= df_glove_umap , y=select, by= "word") 

# Plot the neighbours, highlighting "size" itself in red.
ggplot(selected_words, aes(x = V1, y = V2, colour = word == 'size')) + 
  geom_point(show.legend = FALSE) + 
  scale_color_manual(values = c('black', 'red')) +
  geom_text(aes(V1, V2, label = word), show.legend = FALSE, size = 3.5, vjust=-1.5, hjust=0) +
  labs(title = "GloVe word embedding of words related to 'size'") 

After that I created a function to compute the cosine similarity between two specific words.

# Cosine similarity between the GloVe vectors of two words.
#
# word1, word2: words that must be present in glove_dataframe$word
#               (glove_dataframe is read from the calling environment).
# Returns a 1x1 matrix, as lsa::cosine does.
#
# Fix: the original indexed the embedding columns by position (1:150),
# which silently includes the character "word" column (producing NAs) once
# the columns are reordered later with select(word, everything()). Selecting
# the columns by name makes the function independent of column order while
# returning identical values for every call in this document.
similarity <- function(word1, word2){
  vec_cols <- setdiff(colnames(glove_dataframe), "word")
  lsa::cosine(
    x=as.numeric(glove_dataframe[glove_dataframe$word==word1, vec_cols]),
    y=as.numeric(glove_dataframe[glove_dataframe$word==word2, vec_cols]))
}

And then I looked at some of them. For example, for “cozy” “comfortable” is more similar than “uncomfortable”, which makes perfect sense. In addition, when we look at “socks” and “sweatshirt” the similarity for the former is higher which may be due to the fact that people often get cozy at home and in winter they usually wear exactly the socks and not sweatshirts.

similarity("cozy", "comfortable")
##          [,1]
## [1,] 0.221187
similarity("cozy", "uncomfortable")
##            [,1]
## [1,] 0.05595327
similarity("cozy", "socks")
##            [,1]
## [1,] 0.04363225
similarity("cozy", "sweatshirt")
##            [,1]
## [1,] 0.08630144

From similarities of the word “quality” I can say that people (at least in our dataset) more often wrote about good quality rather than bad one in their reviews.

similarity("quality", "high")
##           [,1]
## [1,] 0.2900964
similarity("quality", "low")
##            [,1]
## [1,] 0.07108497
similarity("quality", "bad")
##              [,1]
## [1,] -0.001152466
similarity("quality", "good")
##           [,1]
## [1,] 0.4723188
similarity("quality", "great")
##           [,1]
## [1,] 0.4001164

If we look at similarities between “size” and “big” and “small”, we can see that “small” has a higher association with “size”, which can mean that people write more often about clothes running smaller in size than running bigger. I think it is quite logical, since it is easier to notice that something is smaller, and if something is bigger one can think that it is just “oversize”.

similarity("size", "big")
##           [,1]
## [1,] 0.3019828
similarity("size", "small")
##           [,1]
## [1,] 0.6439723
similarity("size", "fit")
##           [,1]
## [1,] 0.4900316

After similarities I looked at some analogies. The first one makes total sense to me: getting “petite” and “medium” from size − big + fit (i.e. excluding “big” from “size”) is something very logical.

ex <- glove_main["size", , drop = FALSE] -
  glove_main["big", , drop = FALSE] +
  glove_main["fit", , drop = FALSE]

cos_sim_test <- sim2(x = glove_main, y = ex , method = "cosine", norm = "l2")
head(sort(cos_sim_test[,1], decreasing = T), 5)
##       fit      size perfectly    petite    medium 
## 0.7632463 0.6944099 0.4802457 0.4793153 0.4555791

When I do (vec)quality- (vec)low+ (vec)soft, I get “fabric”, “super” and “comfortable”, which again seems quite logical.

ex2 <-  glove_main["quality", , drop = FALSE] -
  glove_main["low", , drop = FALSE] +
  glove_main["soft", , drop = FALSE]

cos_sim_test <- sim2(x = glove_main, y = ex2 , method = "cosine", norm = "l2")
head(sort(cos_sim_test[,1], decreasing = T), 5)
##        soft     quality      fabric       super comfortable 
##   0.7106433   0.6863965   0.5037753   0.4550513   0.4366626

And with (vec)dress- (vec)style+ (vec)cheap I got “comfortable”, and “maternity” which is not so obvious but quite interesting result.

ex3 <-  glove_main["dress", , drop = FALSE] -
  glove_main["style", , drop = FALSE] +
  glove_main["cheap", , drop = FALSE]

cos_sim_test <- sim2(x = glove_main, y = ex3 , method = "cosine", norm = "l2")
head(sort(cos_sim_test[,1], decreasing = T), 5)
##       cheap       dress comfortable   maternity        made 
##   0.6611893   0.5001571   0.3395653   0.3181072   0.3076874

After that I did machine-learning classification with the word embeddings.

# Prepare the embedding data frame for document-level averaging:
# currently the columns are V1..V150 followed by "word".
colnames(glove_dataframe )
##   [1] "V1"   "V2"   "V3"   "V4"   "V5"   "V6"   "V7"   "V8"   "V9"   "V10" 
##  [11] "V11"  "V12"  "V13"  "V14"  "V15"  "V16"  "V17"  "V18"  "V19"  "V20" 
##  [21] "V21"  "V22"  "V23"  "V24"  "V25"  "V26"  "V27"  "V28"  "V29"  "V30" 
##  [31] "V31"  "V32"  "V33"  "V34"  "V35"  "V36"  "V37"  "V38"  "V39"  "V40" 
##  [41] "V41"  "V42"  "V43"  "V44"  "V45"  "V46"  "V47"  "V48"  "V49"  "V50" 
##  [51] "V51"  "V52"  "V53"  "V54"  "V55"  "V56"  "V57"  "V58"  "V59"  "V60" 
##  [61] "V61"  "V62"  "V63"  "V64"  "V65"  "V66"  "V67"  "V68"  "V69"  "V70" 
##  [71] "V71"  "V72"  "V73"  "V74"  "V75"  "V76"  "V77"  "V78"  "V79"  "V80" 
##  [81] "V81"  "V82"  "V83"  "V84"  "V85"  "V86"  "V87"  "V88"  "V89"  "V90" 
##  [91] "V91"  "V92"  "V93"  "V94"  "V95"  "V96"  "V97"  "V98"  "V99"  "V100"
## [101] "V101" "V102" "V103" "V104" "V105" "V106" "V107" "V108" "V109" "V110"
## [111] "V111" "V112" "V113" "V114" "V115" "V116" "V117" "V118" "V119" "V120"
## [121] "V121" "V122" "V123" "V124" "V125" "V126" "V127" "V128" "V129" "V130"
## [131] "V131" "V132" "V133" "V134" "V135" "V136" "V137" "V138" "V139" "V140"
## [141] "V141" "V142" "V143" "V144" "V145" "V146" "V147" "V148" "V149" "V150"
## [151] "word"
# Move "word" to the first column (dplyr::select + everything()), so the
# embedding dimensions occupy columns 2:151.
glove_dataframe <- select(glove_dataframe, word, everything())
colnames(glove_dataframe )
##   [1] "word" "V1"   "V2"   "V3"   "V4"   "V5"   "V6"   "V7"   "V8"   "V9"  
##  [11] "V10"  "V11"  "V12"  "V13"  "V14"  "V15"  "V16"  "V17"  "V18"  "V19" 
##  [21] "V20"  "V21"  "V22"  "V23"  "V24"  "V25"  "V26"  "V27"  "V28"  "V29" 
##  [31] "V30"  "V31"  "V32"  "V33"  "V34"  "V35"  "V36"  "V37"  "V38"  "V39" 
##  [41] "V40"  "V41"  "V42"  "V43"  "V44"  "V45"  "V46"  "V47"  "V48"  "V49" 
##  [51] "V50"  "V51"  "V52"  "V53"  "V54"  "V55"  "V56"  "V57"  "V58"  "V59" 
##  [61] "V60"  "V61"  "V62"  "V63"  "V64"  "V65"  "V66"  "V67"  "V68"  "V69" 
##  [71] "V70"  "V71"  "V72"  "V73"  "V74"  "V75"  "V76"  "V77"  "V78"  "V79" 
##  [81] "V80"  "V81"  "V82"  "V83"  "V84"  "V85"  "V86"  "V87"  "V88"  "V89" 
##  [91] "V90"  "V91"  "V92"  "V93"  "V94"  "V95"  "V96"  "V97"  "V98"  "V99" 
## [101] "V100" "V101" "V102" "V103" "V104" "V105" "V106" "V107" "V108" "V109"
## [111] "V110" "V111" "V112" "V113" "V114" "V115" "V116" "V117" "V118" "V119"
## [121] "V120" "V121" "V122" "V123" "V124" "V125" "V126" "V127" "V128" "V129"
## [131] "V130" "V131" "V132" "V133" "V134" "V135" "V136" "V137" "V138" "V139"
## [141] "V140" "V141" "V142" "V143" "V144" "V145" "V146" "V147" "V148" "V149"
## [151] "V150"
# Peek at the first rows/dimensions to sanity-check the reordering.
glove_dataframe[1:5, 2:11]
##                      V1          V2          V3         V4         V5
## Absolutely   0.23676578  0.28588050 -0.09662717  0.3581747  0.4320785
## wonderful   -0.08489634 -0.82225524 -0.49909018  0.1338856 -0.3778849
## silky        0.40627422 -0.30104367  0.15406109  0.8227255 -0.3933628
## sexy         0.17379889 -0.04015286  0.01113281 -1.2606894 -0.7595907
## comfortable  0.07595155  0.47015491  0.17285504 -0.5976691 -0.3170695
##                      V6         V7          V8         V9          V10
## Absolutely  -0.04710816 -0.2564801  0.39553601  0.2901069  0.147889982
## wonderful    0.04187202 -0.3162849  0.08928079  0.6025572  0.125961371
## silky       -0.74638879 -0.3467772 -0.41713776 -0.3513734  0.243877645
## sexy         0.18669632  0.1729868  0.67040581 -0.2681141 -0.009611733
## comfortable  0.16327063  0.3133347  0.14258458  0.5474783 -0.413188445
nrow(glove_dataframe)
## [1] 2878
ncol(glove_dataframe)
## [1] 151

Since I have 151 columns in the data frame (the word column plus 150 embedding dimensions), I adjusted the code accordingly.

# Build one 150-dimensional document embedding per review by averaging the
# GloVe vectors of the vocabulary words that occur in each document.
embed <- matrix(NA, nrow=ndoc(Dfm), ncol=150) 

# Loop-invariant values hoisted out of the loop (featnames() was previously
# recomputed on every one of the 5000 iterations).
feat_names <- featnames(Dfm)
n_docs <- ndoc(Dfm)

for (i in seq_len(n_docs)){
  # Progress message every 150 documents (same cadence as the output below).
  if (i %% 150 == 0) message(i, '/', n_docs)
  vec <- as.numeric(Dfm[i,])
  # Words of this document that survived DFM trimming.
  doc_words <- feat_names[vec>0] 
  # Columns 2:151 are V1..V150 (the "word" column sits first at this point).
  embed_vec <- glove_dataframe[glove_dataframe$word %in% doc_words, 2:151]
  if (nrow(embed_vec)==0) {
    # Document shares no vocabulary with the embedding: use a zero vector
    # (the original computed NaN means first, then overwrote them with 0).
    embed[i,] <- 0
  } else {
    embed[i,] <- colMeans(embed_vec, na.rm=TRUE)
  }
} 
## 150/5000
## 300/5000
## 450/5000
## 600/5000
## 750/5000
## 900/5000
## 1050/5000
## 1200/5000
## 1350/5000
## 1500/5000
## 1650/5000
## 1800/5000
## 1950/5000
## 2100/5000
## 2250/5000
## 2400/5000
## 2550/5000
## 2700/5000
## 2850/5000
## 3000/5000
## 3150/5000
## 3300/5000
## 3450/5000
## 3600/5000
## 3750/5000
## 3900/5000
## 4050/5000
## 4200/5000
## 4350/5000
## 4500/5000
## 4650/5000
## 4800/5000
## 4950/5000
# Sanity checks: 5000 documents x 150 dimensions, and the class balance
# of the outcome variable Liked (0/1).
str(embed) 
##  num [1:5000, 1:150] 0.096 -0.0188 0.0368 0.0463 0.0201 ...
str(tot)
## 'data.frame':    5000 obs. of  3 variables:
##  $ X    : int  1 2 3 4 5 6 7 8 9 10 ...
##  $ text : chr  " Absolutely wonderful - silky and sexy and comfortable" " Love this dress!  it s sooo pretty.  i happened to find it in a store, and i m glad i did bc i never would hav"| __truncated__ "Some major design flaws I had such high hopes for this dress and really wanted it to work for me. i initially o"| __truncated__ "My favorite buy! I love, love, love this jumpsuit. it s fun, flirty, and fabulous! every time i wear it, i get "| __truncated__ ...
##  $ Liked: int  0 1 0 1 1 0 1 0 1 1 ...
prop.table(table(tot$Liked))
## 
##      0      1 
## 0.4482 0.5518

Then I ran ranger (random forest) with the estimated word embeddings.

source("Function_CV2Ranger.R")
Function_CV2Ranger

class(embed)

ttt <- as.matrix(embed)
str(ttt)

data2 <- data.frame()
str(data2)

k <- 5
set.seed(123) 
folds <- cvFolds(NROW(embed), K=k)
str(folds)

class(tot$Liked)
y <- as.factor(tot$Liked)
table(y)

colnames(embed) <- paste0("x",1:ncol(embed))

Ranger_res <- Function_CV2Ranger(input=embed, dt=data2, k=5, DV=y, ML=ranger)

And after that I ran the BoW model. Since the dfm already had column names, I did not have to add them here.

# Bag-of-words baseline: the raw (trimmed) DFM as the feature matrix.
z = as.matrix(Dfm) 
# NOTE(review): featnames(Dfm) is the documented accessor; direct slot access
# via @Dimnames works but relies on quanteda internals.
length(Dfm@Dimnames$features) 

colnames(z)

Ranger_res2 <- Function_CV2Ranger(input=z, dt=data2, k=5, DV=y, ML=ranger)

Finally, I compared the accuracy of 2 models and CV of BoW turned out to be better.

# Mean CV metrics across folds: word embeddings vs bag-of-words.
# (BoW wins on all three metrics here.)
colMeans(Ranger_res[ , c(1, 2, 3)])
##               Accuracy Avg. Balanced Accuracy                Avg. F1 
##              0.7318000              0.7229683              0.7244648
colMeans(Ranger_res2[ , c(1, 2, 3)])
##               Accuracy Avg. Balanced Accuracy                Avg. F1 
##              0.7822000              0.7750812              0.7771670